清洗文本数据:
import re
def clean_text(text):
    """Strip punctuation/special characters from *text* and lowercase it.

    Keeps only ASCII letters, digits, and whitespace; everything else
    (punctuation, symbols, non-ASCII characters) is removed.
    """
    # Drop every character that is not an ASCII letter, digit, or whitespace.
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Normalize case after filtering (same result as the original order).
    return alnum_only.lower()
标记文本数据:
from nltk.tokenize import word_tokenize
def tokenize_text(text):
    """Split *text* into a list of word tokens via NLTK's word tokenizer.

    NOTE(review): requires the NLTK 'punkt' tokenizer data to be
    downloaded — confirm the runtime environment provides it.
    """
    return word_tokenize(text)
标准化文本数据:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
def standardize_text(tokens):
    """Remove English stop words from *tokens* and stem what remains.

    Returns a new list; the input list is not modified.

    NOTE(review): stop-word matching is case-sensitive here — tokens are
    assumed to already be lowercased (e.g. via clean_text). Requires the
    NLTK 'stopwords' corpus to be downloaded.
    """
    english_stops = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    # Filter and stem in a single pass; order of surviving tokens is preserved.
    return [stemmer.stem(tok) for tok in tokens if tok not in english_stops]
Generated with assistance from ChatGPT.